In [1]:
pip install pandas-profiling
Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.
Requirement already satisfied: pandas-profiling in c:\users\dorothy.lo\appdata\roaming\python\python39\site-packages (3.6.6)
Requirement already satisfied: ydata-profiling in c:\users\dorothy.lo\appdata\roaming\python\python39\site-packages (from pandas-profiling) (4.0.0)
Requirement already satisfied: scipy<1.10,>=1.4.1 in c:\programdata\anaconda3\lib\site-packages (from ydata-profiling->pandas-profiling) (1.7.3)
Requirement already satisfied: phik<0.13,>=0.11.1 in c:\users\dorothy.lo\appdata\roaming\python\python39\site-packages (from ydata-profiling->pandas-profiling) (0.12.3)
Requirement already satisfied: PyYAML<6.1,>=5.0.0 in c:\programdata\anaconda3\lib\site-packages (from ydata-profiling->pandas-profiling) (6.0)
Requirement already satisfied: statsmodels<0.14,>=0.13.2 in c:\programdata\anaconda3\lib\site-packages (from ydata-profiling->pandas-profiling) (0.13.2)
Requirement already satisfied: multimethod<1.10,>=1.4 in c:\users\dorothy.lo\appdata\roaming\python\python39\site-packages (from ydata-profiling->pandas-profiling) (1.9.1)
Requirement already satisfied: jinja2<3.2,>=2.11.1 in c:\programdata\anaconda3\lib\site-packages (from ydata-profiling->pandas-profiling) (2.11.3)
Requirement already satisfied: requests<2.29,>=2.24.0 in c:\programdata\anaconda3\lib\site-packages (from ydata-profiling->pandas-profiling) (2.27.1)
Requirement already satisfied: tqdm<4.65,>=4.48.2 in c:\programdata\anaconda3\lib\site-packages (from ydata-profiling->pandas-profiling) (4.64.0)
Requirement already satisfied: typeguard<2.14,>=2.13.2 in c:\users\dorothy.lo\appdata\roaming\python\python39\site-packages (from ydata-profiling->pandas-profiling) (2.13.3)
Requirement already satisfied: numpy<1.24,>=1.16.0 in c:\programdata\anaconda3\lib\site-packages (from ydata-profiling->pandas-profiling) (1.21.5)
Requirement already satisfied: seaborn<0.13,>=0.10.1 in c:\programdata\anaconda3\lib\site-packages (from ydata-profiling->pandas-profiling) (0.11.2)
Requirement already satisfied: pydantic<1.11,>=1.8.1 in c:\users\dorothy.lo\appdata\roaming\python\python39\site-packages (from ydata-profiling->pandas-profiling) (1.10.4)
Requirement already satisfied: pandas!=1.4.0,<1.6,>1.1 in c:\programdata\anaconda3\lib\site-packages (from ydata-profiling->pandas-profiling) (1.4.2)
Requirement already satisfied: matplotlib<3.7,>=3.2 in c:\programdata\anaconda3\lib\site-packages (from ydata-profiling->pandas-profiling) (3.5.1)
Requirement already satisfied: visions[type_image_path]==0.7.5 in c:\users\dorothy.lo\appdata\roaming\python\python39\site-packages (from ydata-profiling->pandas-profiling) (0.7.5)
Requirement already satisfied: htmlmin==0.1.12 in c:\users\dorothy.lo\appdata\roaming\python\python39\site-packages (from ydata-profiling->pandas-profiling) (0.1.12)
Requirement already satisfied: tangled-up-in-unicode>=0.0.4 in c:\users\dorothy.lo\appdata\roaming\python\python39\site-packages (from visions[type_image_path]==0.7.5->ydata-profiling->pandas-profiling) (0.2.0)
Requirement already satisfied: attrs>=19.3.0 in c:\programdata\anaconda3\lib\site-packages (from visions[type_image_path]==0.7.5->ydata-profiling->pandas-profiling) (21.4.0)
Requirement already satisfied: networkx>=2.4 in c:\programdata\anaconda3\lib\site-packages (from visions[type_image_path]==0.7.5->ydata-profiling->pandas-profiling) (2.7.1)
Requirement already satisfied: imagehash in c:\users\dorothy.lo\appdata\roaming\python\python39\site-packages (from visions[type_image_path]==0.7.5->ydata-profiling->pandas-profiling) (4.3.1)
Requirement already satisfied: Pillow in c:\programdata\anaconda3\lib\site-packages (from visions[type_image_path]==0.7.5->ydata-profiling->pandas-profiling) (9.0.1)
Requirement already satisfied: MarkupSafe>=0.23 in c:\programdata\anaconda3\lib\site-packages (from jinja2<3.2,>=2.11.1->ydata-profiling->pandas-profiling) (2.0.1)
Requirement already satisfied: kiwisolver>=1.0.1 in c:\programdata\anaconda3\lib\site-packages (from matplotlib<3.7,>=3.2->ydata-profiling->pandas-profiling) (1.3.2)
Requirement already satisfied: cycler>=0.10 in c:\programdata\anaconda3\lib\site-packages (from matplotlib<3.7,>=3.2->ydata-profiling->pandas-profiling) (0.11.0)
Requirement already satisfied: packaging>=20.0 in c:\programdata\anaconda3\lib\site-packages (from matplotlib<3.7,>=3.2->ydata-profiling->pandas-profiling) (21.3)
Requirement already satisfied: python-dateutil>=2.7 in c:\programdata\anaconda3\lib\site-packages (from matplotlib<3.7,>=3.2->ydata-profiling->pandas-profiling) (2.8.2)
Requirement already satisfied: pyparsing>=2.2.1 in c:\programdata\anaconda3\lib\site-packages (from matplotlib<3.7,>=3.2->ydata-profiling->pandas-profiling) (3.0.4)
Requirement already satisfied: fonttools>=4.22.0 in c:\programdata\anaconda3\lib\site-packages (from matplotlib<3.7,>=3.2->ydata-profiling->pandas-profiling) (4.25.0)
Requirement already satisfied: pytz>=2020.1 in c:\programdata\anaconda3\lib\site-packages (from pandas!=1.4.0,<1.6,>1.1->ydata-profiling->pandas-profiling) (2021.3)
Requirement already satisfied: joblib>=0.14.1 in c:\programdata\anaconda3\lib\site-packages (from phik<0.13,>=0.11.1->ydata-profiling->pandas-profiling) (1.1.0)
Requirement already satisfied: typing-extensions>=4.2.0 in c:\users\dorothy.lo\appdata\roaming\python\python39\site-packages (from pydantic<1.11,>=1.8.1->ydata-profiling->pandas-profiling) (4.4.0)
Requirement already satisfied: six>=1.5 in c:\programdata\anaconda3\lib\site-packages (from python-dateutil>=2.7->matplotlib<3.7,>=3.2->ydata-profiling->pandas-profiling) (1.16.0)
Requirement already satisfied: charset-normalizer~=2.0.0 in c:\programdata\anaconda3\lib\site-packages (from requests<2.29,>=2.24.0->ydata-profiling->pandas-profiling) (2.0.4)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\programdata\anaconda3\lib\site-packages (from requests<2.29,>=2.24.0->ydata-profiling->pandas-profiling) (1.26.9)
Requirement already satisfied: certifi>=2017.4.17 in c:\programdata\anaconda3\lib\site-packages (from requests<2.29,>=2.24.0->ydata-profiling->pandas-profiling) (2021.10.8)
Requirement already satisfied: idna<4,>=2.5 in c:\programdata\anaconda3\lib\site-packages (from requests<2.29,>=2.24.0->ydata-profiling->pandas-profiling) (3.3)
Requirement already satisfied: patsy>=0.5.2 in c:\programdata\anaconda3\lib\site-packages (from statsmodels<0.14,>=0.13.2->ydata-profiling->pandas-profiling) (0.5.2)
Requirement already satisfied: colorama in c:\programdata\anaconda3\lib\site-packages (from tqdm<4.65,>=4.48.2->ydata-profiling->pandas-profiling) (0.4.4)
Requirement already satisfied: PyWavelets in c:\programdata\anaconda3\lib\site-packages (from imagehash->visions[type_image_path]==0.7.5->ydata-profiling->pandas-profiling) (1.3.0)

In [2]:
import pandas as pd
df = pd.read_csv("application_record.csv")
In [3]:
df2 = pd.read_csv("credit_record.csv")
In [4]:
df.describe(include='all')
Out[4]:
ID CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL NAME_INCOME_TYPE NAME_EDUCATION_TYPE NAME_FAMILY_STATUS NAME_HOUSING_TYPE DAYS_BIRTH DAYS_EMPLOYED FLAG_MOBIL FLAG_WORK_PHONE FLAG_PHONE FLAG_EMAIL OCCUPATION_TYPE CNT_FAM_MEMBERS
count 4.385570e+05 438557 438557 438557 438557.000000 4.385570e+05 438557 438557 438557 438557 438557.000000 438557.000000 438557.0 438557.000000 438557.000000 438557.000000 304354 438557.000000
unique NaN 2 2 2 NaN NaN 5 5 5 6 NaN NaN NaN NaN NaN NaN 18 NaN
top NaN F N Y NaN NaN Working Secondary / secondary special Married House / apartment NaN NaN NaN NaN NaN NaN Laborers NaN
freq NaN 294440 275459 304074 NaN NaN 226104 301821 299828 393831 NaN NaN NaN NaN NaN NaN 78240 NaN
mean 6.022176e+06 NaN NaN NaN 0.427390 1.875243e+05 NaN NaN NaN NaN -15997.904649 60563.675328 1.0 0.206133 0.287771 0.108207 NaN 2.194465
std 5.716370e+05 NaN NaN NaN 0.724882 1.100869e+05 NaN NaN NaN NaN 4185.030007 138767.799647 0.0 0.404527 0.452724 0.310642 NaN 0.897207
min 5.008804e+06 NaN NaN NaN 0.000000 2.610000e+04 NaN NaN NaN NaN -25201.000000 -17531.000000 1.0 0.000000 0.000000 0.000000 NaN 1.000000
25% 5.609375e+06 NaN NaN NaN 0.000000 1.215000e+05 NaN NaN NaN NaN -19483.000000 -3103.000000 1.0 0.000000 0.000000 0.000000 NaN 2.000000
50% 6.047745e+06 NaN NaN NaN 0.000000 1.607805e+05 NaN NaN NaN NaN -15630.000000 -1467.000000 1.0 0.000000 0.000000 0.000000 NaN 2.000000
75% 6.456971e+06 NaN NaN NaN 1.000000 2.250000e+05 NaN NaN NaN NaN -12514.000000 -371.000000 1.0 0.000000 1.000000 0.000000 NaN 3.000000
max 7.999952e+06 NaN NaN NaN 19.000000 6.750000e+06 NaN NaN NaN NaN -7489.000000 365243.000000 1.0 1.000000 1.000000 1.000000 NaN 20.000000
In [5]:
df2.describe(include='all')
Out[5]:
ID MONTHS_BALANCE STATUS
count 1.048575e+06 1.048575e+06 1048575
unique NaN NaN 8
top NaN NaN C
freq NaN NaN 442031
mean 5.068286e+06 -1.913700e+01 NaN
std 4.615058e+04 1.402350e+01 NaN
min 5.001711e+06 -6.000000e+01 NaN
25% 5.023644e+06 -2.900000e+01 NaN
50% 5.062104e+06 -1.700000e+01 NaN
75% 5.113856e+06 -7.000000e+00 NaN
max 5.150487e+06 0.000000e+00 NaN
In [6]:
from pandas_profiling import ProfileReport
C:\Users\dorothy.lo\AppData\Local\Temp\ipykernel_4192\2274191625.py:1: DeprecationWarning: `import pandas_profiling` is going to be deprecated by April 1st. Please use `import ydata_profiling` instead.
  from pandas_profiling import ProfileReport
In [7]:
profile = ProfileReport(df)
profile
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]
Out[7]:

In [8]:
profile = ProfileReport(df2)
profile
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]
Out[8]:

In [11]:
print((len(df)))
print((len(df.ID.unique())))
#there's duplicates records on applications
438557
438510
In [12]:
print((len(df2)))
print((len(df2.ID.unique())))
#there's many duplicates records on clients
1048575
45985
In [13]:
len(set(df['ID']).intersection(set(df2['ID'])))
#the number of unique intersect client application records
Out[13]:
36457
In [ ]: